In this case study, we are going to build a classifier to estimate the probability of a customer defaulting on their credit card bills.
credit-default.csv
Each row describes one customer. We have details about their savings, employment, age, marital status, etc. In the default column (the target column), the value is 1 if the customer has not defaulted and 2 if the customer has defaulted.
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, auc, roc_curve
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
# Display options: show up to 100000 rows / 500000 columns when printing frames.
# NOTE: the option name is 'display.max_rows' — the original 'display.max_row'
# only worked because pandas does fuzzy option-name matching.
pd.set_option('display.max_rows', 100000)
pd.set_option('display.max_columns', 500000)

# Load the dataset and run a first-pass inspection (notebook display cells).
dt = pd.read_csv('credit-default.csv')
dt.head()
dt.columns
dt.shape
dt.describe()
dt.isnull().sum()  # missing-value count per column
dt.dtypes

# Remap the target: the source coding is 1 = no default, 2 = default.
# After this line, 0 = no default and 1 = default (standard binary labels).
dt['default'] = dt['default'].replace({1: 0, 2: 1})

# Correlation heatmap of the numeric columns.
# numeric_only=True keeps this working on pandas >= 2.0, where corr() raises
# on object columns instead of silently dropping them.
plt.figure(figsize=(20, 20))
sns.heatmap(dt.corr(numeric_only=True), annot=True, cmap='coolwarm')
def histogram(data, path, color, title, xaxis, yaxis):
    """Show a plotly histogram of column *path* in *data*, split by *color*.

    *title*, *xaxis* and *yaxis* set the figure title and axis labels.
    """
    layout = dict(
        title_text=title,
        xaxis_title_text=xaxis,
        yaxis_title_text=yaxis,
        bargap=0.2,       # spacing between adjacent bars
        bargroupgap=0.1,  # spacing between bars within a group
    )
    # update_layout returns the figure, so the calls chain.
    px.histogram(data, x=path, color=color).update_layout(**layout).show()
def _plot_default_percent(data, column, default_value, label):
    """Bar chart of the percentage breakdown of *column* among customers
    with default == default_value.

    *label* is the human-readable column name used in the figure title.
    Returns the plotly figure (a notebook cell displays it automatically).
    Replaces six copy-pasted filter/groupby/plot stanzas in the original.
    """
    subset = data[data['default'] == default_value]
    pct = subset.groupby('default')[column].value_counts(normalize=True)
    pct = pct.mul(100).rename('Percent').reset_index()
    pct['Percent'] = pct['Percent'].round(decimals=2)
    return px.bar(pct, x='default', y='Percent', color=column,
                  title=f"Default as '{default_value}' w.r.t {label}",
                  barmode='group', text='Percent')

# Credit history vs. default.
_plot_default_percent(dt, 'credit_history', 0, 'Credit History')
_plot_default_percent(dt, 'credit_history', 1, 'Credit History')
histogram(dt, "credit_history", "default", 'Default on Credit History', 'Credit History', 'Count')
histogram(dt, "age", "default", 'Age count on default', 'Age Distribution', 'Count')
histogram(dt, "months_loan_duration", "default", 'Months loan count on default', 'Months Distribution', 'Count')

# Loan purpose vs. default.
_plot_default_percent(dt, 'purpose', 1, 'purpose')
_plot_default_percent(dt, 'purpose', 0, 'purpose')
histogram(dt, "purpose", "default", 'Default on purpose', 'Purpose', 'Count')
histogram(dt, "amount", "default", 'Default on amount', 'Amount', 'Count')

# Savings balance vs. default.
_plot_default_percent(dt, 'savings_balance', 1, 'savings balance')
_plot_default_percent(dt, 'savings_balance', 0, 'savings balance')
histogram(dt, "savings_balance", "default", 'Default on savings balance', 'savings_balance', 'Count')
histogram(dt, "employment_length", "default", 'Default on employment length', 'employment_length', 'Count')
histogram(dt, "installment_rate", "default", 'Default on installment_rate', 'installment_rate', 'Count')

# Personal status / other debtors vs. default.
_plot_default_percent(dt, 'personal_status', 1, 'personal status')
_plot_default_percent(dt, 'personal_status', 0, 'personal status')
histogram(dt, "personal_status", "default", 'Default on personal status', 'personal status', 'Count')
histogram(dt, "other_debtors", "default", 'Default on other debtors', 'other debtors', 'Count')

# Property type vs. default.
_plot_default_percent(dt, 'property', 1, 'property')
_plot_default_percent(dt, 'property', 0, 'property')
#### Observation
* **This graph shows the property-type distribution plotted for customers with a default value of 0**
def _plot_default_percent(data, column, default_value, label):
    """Bar chart of the percentage breakdown of *column* among customers
    with default == default_value.

    *label* is the human-readable column name used in the figure title.
    Returns the plotly figure (a notebook cell displays it automatically).
    Replaces the copy-pasted filter/groupby/plot stanzas in the original.
    """
    subset = data[data['default'] == default_value]
    pct = subset.groupby('default')[column].value_counts(normalize=True)
    pct = pct.mul(100).rename('Percent').reset_index()
    pct['Percent'] = pct['Percent'].round(decimals=2)
    return px.bar(pct, x='default', y='Percent', color=column,
                  title=f"Default as '{default_value}' w.r.t {label}",
                  barmode='group', text='Percent')

histogram(dt, "property", "default", 'Default on property', 'property', 'Count')

# Installment plan / housing vs. default.
_plot_default_percent(dt, 'installment_plan', 1, 'installment_plan')
_plot_default_percent(dt, 'installment_plan', 0, 'installment_plan')
histogram(dt, "installment_plan", "default", 'Default on installment_plan', 'installment_plan', 'Count')
histogram(dt, "housing", "default", 'Default on housing', 'housing', 'Count')

# Job type vs. default.
_plot_default_percent(dt, 'job', 1, 'job')
_plot_default_percent(dt, 'job', 0, 'job')
histogram(dt, "job", "default", 'Default on job', 'Job', 'Count')
def correlation_feature(dataset, threshold):
    """Return the names of columns whose absolute pairwise correlation with
    an EARLIER column exceeds *threshold*.

    Only the later column of each highly-correlated pair is reported (lower
    triangle of the correlation matrix), so dropping the returned columns
    keeps one representative of every correlated group.
    """
    corr = dataset.corr().abs()
    cols = corr.columns
    # Scan the strict lower triangle: j < i pairs each counted once.
    return {
        cols[i]
        for i in range(len(cols))
        for j in range(i)
        if corr.iloc[i, j] > threshold
    }
# Columns whose |correlation| with an earlier column exceeds 0.7.
# Pass only the numeric columns: identical result on older pandas (corr()
# dropped object columns silently) and avoids a TypeError on pandas >= 2.0.
corr_features = correlation_feature(dt.select_dtypes(include='number'), 0.7)
len(corr_features)  # already a set — the original's extra set() wrapper was redundant
corr_features
dt.describe(include='object')  # summary of the categorical columns

# Label-encode every categorical column into integer codes.
# One loop replaces the thirteen copy-pasted fit_transform calls of the
# original; reusing a single encoder is safe because fit_transform refits
# it from scratch on each column.
categorical_cols = [
    'checking_balance', 'credit_history', 'purpose', 'savings_balance',
    'employment_length', 'personal_status', 'other_debtors', 'property',
    'installment_plan', 'housing', 'telephone', 'foreign_worker', 'job',
]
label_encoder = LabelEncoder()
for col in categorical_cols:
    dt[col] = label_encoder.fit_transform(dt[col])

dt.dtypes
dt.head()
# Split into feature matrix x and target y.
x = dt.drop(['default'], axis=1)
y = dt['default']

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import chi2

# Univariate chi2 scores (valid here because the label-encoded features
# are all non-negative, as chi2 requires).
ordered_rank_features = SelectKBest(score_func=chi2, k=10)
ordered_feature = ordered_rank_features.fit(x, y)
dfscores = pd.DataFrame(ordered_feature.scores_, columns=["Score"])
dfcolumns = pd.DataFrame(x.columns)  # bug fix: original used x1.columns, but x1 is never defined
features_rank = pd.concat([dfcolumns, dfscores], axis=1, sort=True)
features_rank.columns = ['Features', 'Score']
features_rank
features_rank.nlargest(15, 'Score')  # top 15 features by chi2 score
Using the chi2 method we identified the parameters above that are most strongly related to the target parameter.
# Tree-based feature importances.  The target is a binary class label, so use
# ExtraTreesClassifier — the original used ExtraTreesRegressor, which treats
# the 0/1 labels as a continuous quantity (and the narrative text below also
# calls this the "extra-trees classifier" method).
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(x, y)
print(model.feature_importances_)
fet_import = pd.Series(model.feature_importances_, index=x.columns)  # bug fix: was x1.columns (x1 undefined)
fet_import.nlargest(15).plot(kind='barh')  # top 15 importances as a horizontal bar chart
plt.show()
fet_import.nlargest(15)
We also applied the extra-trees classifier method to identify the parameters most strongly related to the target parameter.
# 70/30 train-test split, seeded for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

# Baseline model: logistic regression.  max_iter is raised from the default
# (100) so the solver can converge on these unscaled features instead of
# stopping early with a ConvergenceWarning.
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(x_train, y_train)
predict_lr_tr = model_lr.predict(x_train)
predict_lr_test = model_lr.predict(x_test)

# Confusion matrices — each on its own figure; the original drew the second
# heatmap on top of the first.
plt.figure()
cm = confusion_matrix(y_train, predict_lr_tr)
sns.heatmap(cm, annot=True, fmt="d")
plt.figure()
cm1 = confusion_matrix(y_test, predict_lr_test)
sns.heatmap(cm1, annot=True, fmt='d')

# Train vs. test accuracy and per-class precision/recall/F1.
print(accuracy_score(y_train, predict_lr_tr))
print(classification_report(y_train, predict_lr_tr))
print(accuracy_score(y_test, predict_lr_test))
print(classification_report(y_test, predict_lr_test))
A higher accuracy score indicates the model is performing well.
Here the accuracy score is only about 68%, so we will try a different algorithm to train our model.
# Bug fix: the original used RandomForestClassifier here but only imported it
# much further down the file — import it before first use.
from sklearn.ensemble import RandomForestClassifier

# Random forest on the unscaled features.
model_rf = RandomForestClassifier()
model_rf.fit(x_train, y_train)
predict_rf_tr = model_rf.predict(x_train)
predict_rf_test = model_rf.predict(x_test)

# Confusion matrices on separate figures (the original overlaid them).
plt.figure()
cm = confusion_matrix(y_train, predict_rf_tr)
sns.heatmap(cm, annot=True, fmt="d")
plt.figure()
cm1 = confusion_matrix(y_test, predict_rf_test)
sns.heatmap(cm1, annot=True, fmt='d')

# Train vs. test accuracy and per-class precision/recall/F1.
print(accuracy_score(y_train, predict_rf_tr))
print(classification_report(y_train, predict_rf_tr))
print(accuracy_score(y_test, predict_rf_test))
print(classification_report(y_test, predict_rf_test))
After applying the random forest we can see the accuracy score has improved, but we will try to improve it further.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest: forest size, feature
# subsampling per split, and minimum samples required to split a node.
model_params = {
    'n_estimators': [50, 150, 250],
    'max_features': ['sqrt', 0.25, 0.5, 0.75, 1.0],
    'min_samples_split': [2, 4, 6],
}

# Exhaustive search over the grid with 5-fold cross-validation.
rf_model = RandomForestClassifier(random_state=1)
clf = GridSearchCV(rf_model, model_params, cv=5)
model = clf.fit(x_train, y_train)

# Evaluate the tuned model on the held-out split.
grid_predict = model.predict(x_test)
cm1 = confusion_matrix(y_test, grid_predict)
sns.heatmap(cm1, annot=True, fmt='d')
print(accuracy_score(y_test, grid_predict))
print(classification_report(y_test, grid_predict))
After fine-tuning the model we don't see much improvement in accuracy.
Now we will scale the dataset to try to improve the accuracy.
# Min-max scale the FEATURES only.  The original scaled the whole frame
# including the target; for a 0/1 target the values happen to survive
# unchanged but become floats — keeping y out of the scaling is cleaner and
# preserves integer class labels.
x = dt.drop(['default'], axis=1)
min_x = x.min()
range_x = (x - min_x).max()
range_x = range_x.replace(0, 1)  # guard: a constant column would divide by zero
x = (x - min_x) / range_x
y = dt['default']

# Same 70/30 split and seed as before, so results are comparable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

model_lr = LogisticRegression()
model_lr.fit(x_train, y_train)
predict_lr_tr = model_lr.predict(x_train)
predict_lr_test = model_lr.predict(x_test)

# Confusion matrices on separate figures (the original overlaid them).
plt.figure()
cm = confusion_matrix(y_train, predict_lr_tr)
sns.heatmap(cm, annot=True, fmt="d")
plt.figure()
cm1 = confusion_matrix(y_test, predict_lr_test)
sns.heatmap(cm1, annot=True, fmt='d')

# Train vs. test accuracy and per-class precision/recall/F1.
print(accuracy_score(y_train, predict_lr_tr))
print(classification_report(y_train, predict_lr_tr))
print(accuracy_score(y_test, predict_lr_test))
print(classification_report(y_test, predict_lr_test))
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the min-max scaled features.
model_rf = RandomForestClassifier()
model_rf.fit(x_train, y_train)
train_pred = model_rf.predict(x_train)
test_pred = model_rf.predict(x_test)

# Confusion matrices for the train and test splits.
train_cm = confusion_matrix(y_train, train_pred)
sns.heatmap(train_cm, annot=True, fmt="d")
test_cm = confusion_matrix(y_test, test_pred)
sns.heatmap(test_cm, annot=True, fmt='d')

# Accuracy and per-class precision/recall/F1 for both splits.
print(accuracy_score(y_train, train_pred))
print(classification_report(y_train, train_pred))
print(accuracy_score(y_test, test_pred))
print(classification_report(y_test, test_pred))
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest on the scaled features.
model_params = {
    'n_estimators': [50, 150, 250],
    'max_features': ['sqrt', 0.25, 0.5, 0.75, 1.0],
    'min_samples_split': [2, 4, 6],
}

# Bug fix: the original built and fitted this identical GridSearchCV twice in
# a row — a copy-paste duplicate that only doubled the runtime.  One fit
# suffices.
rf_model = RandomForestClassifier(random_state=1)
clf = GridSearchCV(rf_model, model_params, cv=5)
model = clf.fit(x_train, y_train)

# Evaluate the tuned model on the held-out split.
grid_predict = model.predict(x_test)
cm1 = confusion_matrix(y_test, grid_predict)
sns.heatmap(cm1, annot=True, fmt='d')
print(accuracy_score(y_test, grid_predict))
print(classification_report(y_test, grid_predict))